# Chunk options: show code in the rendered document
knitr::opts_chunk$set(echo = TRUE)
#install.packages("corrgram")
library(tidyverse)   # also attaches ggplot2, dplyr, tidyr, readr
library(corrgram)    # visualisation of correlations
library(lmtest)      # more linear regression tools
library(naniar)      # missing data analysis
library(tsibble)     # time series tibbles
library(fable)       # tsibble-based forecasting
library(fabletools)  # data structures for fable
library(feasts)      # feature extraction for time series
library(hrbrthemes)  # ggplot styling / theme
library(GGally)      # pair plots
# (Fix: duplicate library(hrbrthemes) and redundant library(ggplot2)
# calls removed -- ggplot2 is attached by tidyverse, hrbrthemes was
# previously loaded twice.)
ggplot2::theme_set(hrbrthemes::theme_ipsum_rc())

Introduction

Data description

The data file production_data.csv contains 400 observations on 8 variables. 6 of the variables are synthetic production rates of 10Be for the last 8000 years from different regions of the globe. These regional synthetic production rates are based on a production model that can simulate the theoretical production rate at each latitude and height (km). We generate the production rates and combine them (by summation) into different regions. Therefore, correlations are expected between neighboring regions. Besides the regional correlation, there is also temporal autocorrelation in the data set.
2 of the variables are observations (EDML and GRIP). They are 10Be fluxes measured from the EDML ice core in Antarctica and the GRIP ice core in Greenland.

Variable description

variable description
age time before 1950 [years]
Q_trop_high 10Be production rate in the troposphere at high latitude (60-90°) [atoms/cm2/s]
Q_trop_mid 10Be production rate in the troposphere at mid latitude (30-60°) [atoms/cm2/s]
Q_trop_low 10Be production rate in the troposphere at low latitude (0-30°) [atoms/cm2/s]
Q_stra_high 10Be production rate in the stratosphere at high latitude (60-90°) [atoms/cm2/s]
Q_stra_mid 10Be production rate in the stratosphere at mid latitude (30-60°) [atoms/cm2/s]
Q_stra_low 10Be production rate in the stratosphere at low latitude (0-30°) [atoms/cm2/s]
EDML 10Be flux [10^6 atoms/cm2/year] to Antarctica measured at the EDML ice core [normalized]
GRIP 10Be flux [10^6 atoms/cm2/year] to Greenland measured at the GRIP ice core [normalized]

The variables EDML and GRIP (10Be flux) were normalized to have the same mean as the sum of 10Be production rate from all the regions over the last 8000 years. The goal is to analyse how much each region contributes to the 10Be fluxes to either Greenland or Antarctica.

Exploratory Data Analysis

Correlation structure

First we load the data and look at it.

# Load the production/flux data; suppress the column-type message
icecore <- read_csv("data/production_data.csv", show_col_types = FALSE)
# Peek at the first rows to check that parsing produced the expected columns
head(icecore)
## # A tibble: 6 × 9
##     age Q_trop_high Q_trop_mid Q_trop_low Q_stra_h…¹ Q_str…² Q_str…³  EDML  GRIP
##   <dbl>       <dbl>      <dbl>      <dbl>      <dbl>   <dbl>   <dbl> <dbl> <dbl>
## 1     0       0.428      0.459      0.247       1.62   0.800  0.0783    NA    NA
## 2    20       0.456      0.479      0.249       1.77   0.849  0.0792    NA    NA
## 3    40       0.472      0.489      0.250       1.85   0.875  0.0795    NA    NA
## 4    60       0.468      0.485      0.248       1.83   0.866  0.0789    NA    NA
## 5    80       0.434      0.459      0.243       1.65   0.802  0.0768    NA    NA
## 6   100       0.427      0.453      0.241       1.61   0.788  0.0762    NA    NA
## # … with abbreviated variable names ¹​Q_stra_high, ²​Q_stra_mid, ³​Q_stra_low

Lets explore the correlation structure among the covariates.

# Correlate only the predictors: drop the time index and both responses
predictor_cols <- names(icecore)[!names(icecore) %in% c("age", "EDML", "GRIP")]
# PCA ordering places strongly correlated variables next to each other
corrgram::corrgram(icecore[, predictor_cols],
                   order = "PCA",
                   main = "Correlogram for the icecore data",
                   lower.panel = panel.ellipse, upper.panel = panel.pie,
                   diag.panel = panel.minmax, text.panel = panel.txt)

All independent variables are strongly correlated between themselves. Reordering by PCA identifies the groupings which make sense (by latitude).

Missing values

There’s some missingness in the data. Let’s have a look

# Visualise the location of missing values across all columns
naniar::vis_miss(icecore)

The missingness is present only in the response variables EDML and GRIP and only in the first few observations, i.e. it is missing not-at-random.

Time aspect

We will take a somewhat closer look at the time series aspect of the data. First of all, since this is geological data, the age is measured backwards, which means that age=0 is actually the last (most recent) observation. Is the age equally spaced? Yes, it is equi-spaced at 20-year intervals.

# A single unique step confirms the series is equi-spaced (20 years)
unique(diff(icecore$age))
## [1] 20

Let’s rearrange the data and add an index. We will turn this tibble into the time series object (using class tsibble).

# Convert "age before 1950" into a forward-running time axis:
# time = -age, placed as the first column, sorted oldest to newest.
icecore_df <- icecore %>%
  mutate(time = -age) %>%
  relocate(time) %>%
  select(-age) %>%
  arrange(time)

Lets plot the response variables

# Long format so both cores share one plotting pipeline,
# then one line per core in its own facet.
icecore_df %>%
  select(time, EDML, GRIP) %>%
  pivot_longer(cols = c(EDML, GRIP), names_to = "core", values_to = "value") %>%
  ggplot(aes(x = time, y = value, color = core)) +
  geom_line() +
  facet_wrap(vars(core), ncol = 1)
## Warning: Removed 77 rows containing missing values (`geom_line()`).

There does not appear to be a clear seasonality (after all the data is sampled every 20 years), but we can clearly see that there’s a trend.

Train-test split

First, we will create the training and test set, separating the period for which we don’t have the response variable, but have the predictor variables

# Rows where each response is observed define that core's training period
idx_GRIP_train <- !is.na(icecore_df$GRIP)
idx_EDML_train <- !is.na(icecore_df$EDML)

# GRIP: training rows have an observed GRIP value; the test period keeps
# only the covariates (neither response column).
GRIP_train_df <- icecore_df %>%
  filter(idx_GRIP_train) %>%
  select(-EDML)
GRIP_train_ts <- as_tsibble(GRIP_train_df, index = time)

GRIP_test_df <- icecore_df %>%
  filter(!idx_GRIP_train) %>%
  select(-GRIP, -EDML)
GRIP_test_ts <- as_tsibble(GRIP_test_df, index = time)

# EDML: same construction with the roles of the two responses swapped
EDML_train_df <- icecore_df %>%
  filter(idx_EDML_train) %>%
  select(-GRIP)
EDML_train_ts <- as_tsibble(EDML_train_df, index = time)

EDML_test_df <- icecore_df %>%
  filter(!idx_EDML_train) %>%
  select(-GRIP, -EDML)
EDML_test_ts <- as_tsibble(EDML_test_df, index = time)

Modeling approaches

The methods we used in class up to this point are intended for data that is independent and identically distributed (IID). This is the fundamental assumption behind linear regression, correlation and hypothesis testing (the theory behind the confidence intervals and p-values). The alternative approach would be to consider this data from the point of view of a time-indexed series (TS) of observations, where the observations are in fact not independent. Both approaches are valuable for learning and we will adopt each one of them in the sections that follow.

IID

In this approach we only focus on the observation data in EDML.

Antarctica (EDML) ice core

Regression model

First, we need to address the possible time autocorrelation introduced by the response variable into the error term of our linear models. The time autocorrelation in the EDML data is significant up to lag 3 (i.e. 3 data points away from the sampling point).

# Widen plot margins so the acf() title is not clipped
par(mar=c(4,4,4,4))
# Sample autocorrelation of the raw EDML training series
acf(EDML_train_df$EDML, main='EDML data')

We can attempt to reduce this by binning the data, taking the average of every 3 data points. Below is a summary of the processed data set. It can be observed that the mean production rate of 10Be is lower in the troposphere compared to the stratosphere. In particular, the production rate in the stratosphere is significantly higher at the mid and high latitudes.

# Bin the training data by averaging consecutive rows to weaken the
# serial correlation, then drop the helper index and the time column.
# NOTE(review): row_number() %/% 3 puts only rows 1-2 in the first bin
# (bins of 3 thereafter); (row_number() - 1) %/% 3 would give even bins
# of 3 -- confirm whether the uneven first bin is intended. Changing it
# would alter all downstream printed results.
EDML_dat <- EDML_train_df %>% 
  group_by(idx=row_number() %/% 3) %>%
  summarise_all(mean) %>% 
  select(-idx, -time)
# Summary
summary(EDML_dat)
##   Q_trop_high       Q_trop_mid       Q_trop_low      Q_stra_high   
##  Min.   :0.3806   Min.   :0.3773   Min.   :0.1856   Min.   :1.384  
##  1st Qu.:0.4363   1st Qu.:0.4187   1st Qu.:0.1963   1st Qu.:1.662  
##  Median :0.4661   Median :0.4516   Median :0.2131   Median :1.817  
##  Mean   :0.4730   Mean   :0.4626   Mean   :0.2227   Mean   :1.862  
##  3rd Qu.:0.5017   3rd Qu.:0.5026   3rd Qu.:0.2488   3rd Qu.:2.017  
##  Max.   :0.6048   Max.   :0.5759   Max.   :0.2829   Max.   :2.609  
##    Q_stra_mid       Q_stra_low           EDML      
##  Min.   :0.6361   Min.   :0.05638   Min.   :3.194  
##  1st Qu.:0.7323   1st Qu.:0.06002   1st Qu.:3.576  
##  Median :0.8067   Median :0.06602   Median :3.853  
##  Mean   :0.8278   Mean   :0.06956   Mean   :3.941  
##  3rd Qu.:0.9104   3rd Qu.:0.07895   3rd Qu.:4.335  
##  Max.   :1.0873   Max.   :0.09143   Max.   :4.922

Correlation table of the data:

# Pairwise Pearson correlations among the binned predictors and EDML
cor(EDML_dat)
##             Q_trop_high Q_trop_mid Q_trop_low Q_stra_high Q_stra_mid Q_stra_low
## Q_trop_high   1.0000000  0.8829865  0.6166766   0.9995405  0.9188956  0.6084902
## Q_trop_mid    0.8829865  1.0000000  0.9135408   0.8793770  0.9962957  0.9092642
## Q_trop_low    0.6166766  0.9135408  1.0000000   0.6110331  0.8759230  0.9999210
## Q_stra_high   0.9995405  0.8793770  0.6110331   1.0000000  0.9162366  0.6028479
## Q_stra_mid    0.9188956  0.9962957  0.8759230   0.9162366  1.0000000  0.8709475
## Q_stra_low    0.6084902  0.9092642  0.9999210   0.6028479  0.8709475  1.0000000
## EDML          0.7488654  0.8929001  0.8517272   0.7455337  0.8826357  0.8491512
##                  EDML
## Q_trop_high 0.7488654
## Q_trop_mid  0.8929001
## Q_trop_low  0.8517272
## Q_stra_high 0.7455337
## Q_stra_mid  0.8826357
## Q_stra_low  0.8491512
## EDML        1.0000000

Simple linear regression

We select Q_stra_mid to build a simple linear regression to predict the EDML data. The reasons are that (1) it is highly correlated with EDML and (2) a lot of 10Be is produced here.

# Simple linear regression: EDML flux on mid-latitude stratospheric production
q_stra_mid_lm <- lm(EDML ~ Q_stra_mid, data=EDML_dat)
summary(q_stra_mid_lm)
## 
## Call:
## lm(formula = EDML ~ Q_stra_mid, data = EDML_dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.49832 -0.12659 -0.00454  0.10797  0.73328 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.0235     0.1481   6.911 3.11e-10 ***
## Q_stra_mid    3.5249     0.1774  19.872  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2056 on 112 degrees of freedom
## Multiple R-squared:  0.779,  Adjusted R-squared:  0.7771 
## F-statistic: 394.9 on 1 and 112 DF,  p-value: < 2.2e-16

The independent variable is significant and can explain ~78% of the variation as indicated by \(R^2\). Below are the confidence intervals for the coefficient estimates.

# 95% confidence intervals for the intercept and slope
confint(q_stra_mid_lm)
##                 2.5 %   97.5 %
## (Intercept) 0.7300233 1.316898
## Q_stra_mid  3.1734306 3.876344

Here, we should be careful when interpreting the coefficients. Since the EDML data was normalized, we cannot interpret the absolute value of the coefficients. If we took this literally, it would mean that 1 atom of 10Be produced in the mid-latitude stratosphere corresponds to 3 atoms of 10Be arriving in Antarctica. This makes no sense! However, we can conclude that there is a strong positive linear relation between Q_stra_mid and EDML. The response variable is plotted versus the independent variable below. The plot includes the linear prediction (black solid line), its confidence interval (grey envelope) and its prediction interval (dashed black lines).

# Grid over the observed range of Q_stra_mid for the prediction band
newdat <- data.frame(
  Q_stra_mid = seq(min(EDML_dat$Q_stra_mid), max(EDML_dat$Q_stra_mid), 0.05)
)
pred_int <- cbind(newdat, predict(q_stra_mid_lm, newdat, interval = "prediction"))
# Scatter, fitted line with confidence band, and dashed prediction band
ggplot(EDML_dat, aes(x = Q_stra_mid, y = EDML)) +
  geom_point(color = 'blue', alpha = 0.7) +
  geom_smooth(method = "lm", color = 'black') +
  geom_line(data = pred_int, aes(y = upr), color = 'black', linetype = 2) +
  geom_line(data = pred_int, aes(y = lwr), color = 'black', linetype = 2)
## `geom_smooth()` using formula = 'y ~ x'

The linear model works quite well. Let’s assess the model fit: There is no time autocorrelation or trend in the residual.

# Diagnostics table for the simple model.
# Fix: the original used predict(q_stra_mid_lm, data = EDML_dat), but
# predict.lm() has no `data` argument -- it was silently ignored and the
# training-set fitted values were returned anyway; fitted() states that
# intent explicitly.
EDML_pred_simple <- EDML_dat %>%
  select(EDML, Q_stra_mid) %>%
  mutate(fit = fitted(q_stra_mid_lm),
         res = residuals(q_stra_mid_lm),
         rstudent = rstudent(q_stra_mid_lm))
# Residuals in observation (time) order: check for a leftover trend
ggplot(data = EDML_pred_simple,
       aes(x = as.numeric(row.names(EDML_pred_simple)), y = res)) +
  geom_point() + geom_line() + labs(x = 'Index', y = 'Residual')

par(mar = c(4, 4, 4, 4))
# Autocorrelation of the residuals after binning
acf(EDML_pred_simple$res, main = "Residual (Simple Linear)")

The variance of error terms seems fine.

# Residuals vs fitted values: check for non-constant variance
EDML_pred_simple %>%
  ggplot(aes(x = fit, y = res)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

# Same check on the studentized-residual scale
EDML_pred_simple %>%
  ggplot(aes(x = fit, y = rstudent)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

Polynomial regression

Let’s try to fit the data with a simple polynomial regression.

# Quadratic fit using poly(): an orthogonal polynomial basis, so the
# linear and quadratic terms are uncorrelated with each other.
# (A leftover commented-out block that built a hand-made squared column
# was dead code and has been removed.)
q_stra_mid_poly <- lm(EDML ~ poly(Q_stra_mid,2),
                      data=EDML_pred_simple)
summary(q_stra_mid_poly)
## 
## Call:
## lm(formula = EDML ~ poly(Q_stra_mid, 2), data = EDML_pred_simple)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.50001 -0.11796 -0.00382  0.11053  0.72840 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           3.94147    0.01933 203.860   <2e-16 ***
## poly(Q_stra_mid, 2)1  4.08531    0.20643  19.790   <2e-16 ***
## poly(Q_stra_mid, 2)2 -0.05810    0.20643  -0.281    0.779    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2064 on 111 degrees of freedom
## Multiple R-squared:  0.7792, Adjusted R-squared:  0.7752 
## F-statistic: 195.9 on 2 and 111 DF,  p-value: < 2.2e-16

This model does not help explain more variation in the EDML data and the quadratic coefficient is not significant. The plot below with the fitted model also shows that the relation is not really polynomial.

# Grid of Q_stra_mid values for the confidence/prediction bands.
# Fix: the old grid also carried a hand-made Q_stra_mid2 column that
# predict() ignored -- poly() in the fitted model rebuilds its own basis
# from the raw predictor, so only Q_stra_mid is needed.
X <- data.frame(Q_stra_mid = seq(0.6, 1.1, 0.1))
pred_int <- cbind(X, predict(q_stra_mid_poly, X, interval = "prediction"))
conf_int <- cbind(X, predict(q_stra_mid_poly, X, interval = "confidence"))
# Plot: data, fitted curve, confidence band (orange), prediction band (dashed)
ggplot(data = EDML_pred_simple, aes(x = Q_stra_mid, y = EDML)) +
  geom_point(alpha = 0.7, color = 'blue') +
  geom_line(data = pred_int, aes(x = Q_stra_mid, y = fit)) +
  geom_line(data = conf_int, aes(x = Q_stra_mid, y = upr), color = 'orange') +
  geom_line(data = conf_int, aes(x = Q_stra_mid, y = lwr), color = 'orange') +
  geom_line(data = pred_int, aes(x = Q_stra_mid, y = upr), color = 'black', linetype = 2) +
  geom_line(data = pred_int, aes(x = Q_stra_mid, y = lwr), color = 'black', linetype = 2)

Multiple regression model

Let’s try to use all of the variables to predict EDML.

# Full model: regress EDML on all remaining columns of EDML_dat
# (the six regional production rates)
all_lm <- lm(EDML ~ ., data=EDML_dat)
summary(all_lm)
## 
## Call:
## lm(formula = EDML ~ ., data = EDML_dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.46695 -0.11348 -0.01852  0.11081  0.66198 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    5.573      4.720   1.181    0.240
## Q_trop_high  -39.817     94.061  -0.423    0.673
## Q_trop_mid   107.423    223.773   0.480    0.632
## Q_trop_low  -321.301    423.007  -0.760    0.449
## Q_stra_high    7.830     15.309   0.511    0.610
## Q_stra_mid   -38.798     75.660  -0.513    0.609
## Q_stra_low   813.734    942.588   0.863    0.390
## 
## Residual standard error: 0.1948 on 107 degrees of freedom
## Multiple R-squared:  0.8106, Adjusted R-squared:  0.7999 
## F-statistic:  76.3 on 6 and 107 DF,  p-value: < 2.2e-16

This multi-linear model does not help explain more variation in the EDML data and none of the coefficients are significant. We believe that only a subset of the variables are useful since the variables are strongly correlated among themselves. Let’s only select 2 regions in the stratosphere that are relevant and highly correlated to the EDML data Q_stra_mid and Q_stra_high.

# Reduced model: the two stratospheric predictors singled out above
stra_lm <- lm(EDML ~ Q_stra_mid + Q_stra_high, data=EDML_dat)
summary(stra_lm)
## 
## Call:
## lm(formula = EDML ~ Q_stra_mid + Q_stra_high, data = EDML_dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.50037 -0.09150 -0.00279  0.10930  0.67088 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.0809     0.1410   7.667  7.2e-12 ***
## Q_stra_mid    4.9649     0.4190  11.850  < 2e-16 ***
## Q_stra_high  -0.6710     0.1789  -3.751 0.000281 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1945 on 111 degrees of freedom
## Multiple R-squared:  0.8039, Adjusted R-squared:  0.8004 
## F-statistic: 227.5 on 2 and 111 DF,  p-value: < 2.2e-16

This multi-linear model can explain 80% variation in the observation data which is slightly better than the simple model. Both of the independent variables are relevant.

# 95% confidence intervals for the reduced model's coefficients
confint(stra_lm)
##                  2.5 %     97.5 %
## (Intercept)  0.8015552  1.3602752
## Q_stra_mid   4.1347146  5.7951463
## Q_stra_high -1.0253765 -0.3165372

And again the residual looks fine.

# Diagnostics table for the two-predictor model.
# Fix: predict(stra_lm, data = EDML_dat) silently ignored `data`
# (predict.lm has no such argument) and returned the training fitted
# values; fitted() makes that explicit.
EDML_pred_multi <- EDML_dat %>%
  select(EDML, Q_stra_mid, Q_stra_high) %>%
  mutate(fit = fitted(stra_lm),
         res = residuals(stra_lm),
         rstudent = rstudent(stra_lm))
# Residuals in observation order
ggplot(data = EDML_pred_multi,
       aes(x = as.numeric(row.names(EDML_pred_multi)), y = res)) +
  geom_point() + geom_line() + labs(x = 'Index', y = 'Residual') + theme_bw()

par(mar = c(4, 4, 4, 4))
# Fix: the title previously said "Simple Linear" for this multiple model
plot(acf(EDML_pred_multi$res, plot = FALSE),
     main = "Residual (Multiple Linear)")

# Residuals vs fitted values for the two-predictor model
EDML_pred_multi %>%
  ggplot(aes(x = fit, y = res)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

# Same check on the studentized-residual scale
EDML_pred_multi %>%
  ggplot(aes(x = fit, y = rstudent)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

The plot below shows the prediction of EDML by Q_stra_mid where Q_stra_high values = 1.5, 2.0 and 2.5 (indicated by the numbers below the prediction lines).

# Prediction lines of EDML vs Q_stra_mid at three fixed Q_stra_high levels
# (the scalar Q_stra_high is recycled across the 6-value grid)
mid_grid <- seq(0.6, 1.1, 0.1)
X15 <- data.frame(Q_stra_mid = mid_grid, Q_stra_high = 1.5)
X20 <- data.frame(Q_stra_mid = mid_grid, Q_stra_high = 2.0)
X25 <- data.frame(Q_stra_mid = mid_grid, Q_stra_high = 2.5)
pred_int15 <- cbind(X15, predict(stra_lm, X15, interval = "prediction"))
pred_int20 <- cbind(X20, predict(stra_lm, X20, interval = "prediction"))
pred_int25 <- cbind(X25, predict(stra_lm, X25, interval = "prediction"))
# Each dashed line is annotated with its Q_stra_high value
ggplot(EDML_pred_multi, aes(x = Q_stra_mid, y = EDML)) +
  geom_point(alpha = 0.7, color = 'blue') +
  geom_line(data = pred_int15, aes(y = fit), linetype = 2) +
  geom_line(data = pred_int20, aes(y = fit), linetype = 2) +
  geom_line(data = pred_int25, aes(y = fit), linetype = 2) +
  geom_text(x = 1.1, y = 4.7, label = "2.5") +
  geom_text(x = 1.1, y = 5.1, label = "2.0") +
  geom_text(x = 1.1, y = 5.4, label = "1.5")

The plot below shows the prediction of EDML by Q_stra_high where Q_stra_mid values = 0.7, 0.9 and 1.1 (indicated by the numbers below the prediction lines).

# Generate prediction data
X07 <- data.frame(Q_stra_high=seq(1.3,2.7,0.1),
                Q_stra_mid=rep(0.7,15))
X09 <- data.frame(Q_stra_high=seq(1.3,2.7,0.1),
                Q_stra_mid=rep(0.9,15))
X11 <- data.frame(Q_stra_high=seq(1.3,2.7,0.1),
                Q_stra_mid=rep(1.1,15))

pred_int07 <- cbind(X07,predict(stra_lm,X07,interval="prediction"))
pred_int09 <- cbind(X09,predict(stra_lm,X09,interval="prediction"))
pred_int11 <- cbind(X11,predict(stra_lm,X11,interval="prediction"))
# Plot
ggplot(data=EDML_pred_multi, aes(x=Q_stra_high,y=EDML)) + 
  geom_point(alpha=0.7, color='blue') +
  geom_line(data=pred_int07,aes(x=Q_stra_high,y=fit), linetype=2) +
  geom_line(data=pred_int09,aes(x=Q_stra_high,y=fit), linetype=2) + 
  geom_line(data=pred_int11,aes(x=Q_stra_high,y=fit), linetype=2) +
  geom_text(x=1.3,y=3.5,label="0.7") + 
  geom_text(x=1.3,y=4.5,label="0.9") + 
  geom_text(x=1.3,y=5.5,label="1.1") 

The plots show the negative correlation between Q_stra_high and EDML. However, by looking at the second plot (Q_stra_high vs EDML) the correlation is rather positive. So despite the fact that the multi-linear model can explain extra variation, we suggest to use the simple linear regression with one variable.

PCA

Since the independent variables are strongly correlated we believe that PCA will help in combining them into new useful variables. PC loadings:

# Unscaled PCA on the six production-rate predictors, so regions that
# produce more 10Be carry proportionally more weight in the components.
pca_vars <- c("Q_stra_low", "Q_stra_mid", "Q_stra_high",
              "Q_trop_low", "Q_trop_mid", "Q_trop_high")
pca_dat <- EDML_dat %>% select(all_of(pca_vars))
pr.out <- prcomp(pca_dat, scale = FALSE)
# Loadings: how each original variable enters each principal component
pr.out$rotation
##                     PC1         PC2          PC3         PC4         PC5
## Q_stra_low  -0.02401422 -0.15145143  0.276092510 -0.05395517  0.33090402
## Q_stra_mid  -0.36378253 -0.70901258 -0.527058441 -0.13840200  0.25149546
## Q_stra_high -0.90030159  0.37888603  0.124574888 -0.16337697 -0.05919754
## Q_trop_low  -0.06732330 -0.41586210  0.782268557 -0.10389355  0.20313615
## Q_trop_mid  -0.15753082 -0.39241770  0.135914314  0.36573761 -0.79204439
## Q_trop_high -0.16490678  0.06225805 -0.006828406  0.89815579  0.39389035
##                     PC6
## Q_stra_low  -0.88760919
## Q_stra_mid   0.06904885
## Q_stra_high -0.01367975
## Q_trop_low   0.39815076
## Q_trop_mid  -0.20401319
## Q_trop_high  0.08396208

How many PCs do we need?

# Proportion of variance explained by each component (scree plot)
pr.var <- pr.out$sdev^2
pve <- pr.var / sum(pr.var)
df_pve <- data.frame(pve = pve, PC = seq_along(pve))
plot1 <- ggplot(df_pve, aes(x = PC, y = pve)) +
  geom_point(size = 3) +
  geom_line() +
  labs(y = "Proportion of variance explained") +
  theme(text = element_text(size = 16))
plot1

So most of the variation in our independent variables can be explained by PC1 and PC2 (to a lesser degree). Note that we used scale = FALSE to account for the differences in the amount of 10Be produced in different regions. PC1 is dominated by Q_stra_high and Q_stra_mid as we expected. PC2 contains the variations in Q_stra_mid that are negatively correlated with Q_stra_high. Let’s look at the relationship between PC1, PC2 and EDML.

# Attach the first two PC scores, then check PC1 against EDML
EDML_dat$PC1 <- pr.out$x[, 1]
EDML_dat$PC2 <- pr.out$x[, 2]
ggplot(EDML_dat, aes(x = PC1, y = EDML)) +
  geom_point() +
  geom_smooth(method = 'lm')
## `geom_smooth()` using formula = 'y ~ x'

# PC2 against EDML
ggplot(EDML_dat, aes(x = PC2, y = EDML)) +
  geom_point() +
  geom_smooth(method = 'lm')
## `geom_smooth()` using formula = 'y ~ x'

We can see that PC1 is positively correlated with EDML while PC2 is negatively correlated with EDML. Let’s predict EDML with a multiple linear regression model consisting of PC1 and PC2.

# Regress EDML on the first two principal component scores
pca_lm <- lm(EDML ~ PC1 + PC2, data=EDML_dat)
summary(pca_lm)
## 
## Call:
## lm(formula = EDML ~ PC1 + PC2, data = EDML_dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.49924 -0.09448 -0.00037  0.10865  0.66798 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.94147    0.01818  216.77   <2e-16 ***
## PC1         -1.20214    0.06456  -18.62   <2e-16 ***
## PC2         -3.78308    0.35951  -10.52   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1941 on 111 degrees of freedom
## Multiple R-squared:  0.8047, Adjusted R-squared:  0.8012 
## F-statistic: 228.7 on 2 and 111 DF,  p-value: < 2.2e-16
# 95% confidence intervals for the PC-regression coefficients
confint(pca_lm)
##                 2.5 %    97.5 %
## (Intercept)  3.905441  3.977501
## PC1         -1.330073 -1.074205
## PC2         -4.495472 -3.070679

The negative coefficient of PC2 is now making sense. PC2 collects the variation in Q_stra_mid that is negatively correlated with EDML. Again the residual looks fine.

# Diagnostics table for the PC regression.
# Fix: predict(pca_lm, data = EDML_dat) silently ignored `data`
# (predict.lm has no such argument) and returned the training fitted
# values; fitted() makes that explicit.
EDML_pred_pca <- EDML_dat %>%
  select(EDML, PC1, PC2) %>%
  mutate(fit = fitted(pca_lm),
         res = residuals(pca_lm),
         rstudent = rstudent(pca_lm))
# Residuals in observation order
ggplot(data = EDML_pred_pca,
       aes(x = as.numeric(row.names(EDML_pred_pca)), y = res)) +
  geom_point() + geom_line() + labs(x = 'Index', y = 'Residual') + theme_bw()

par(mar = c(4, 4, 4, 4))
plot(acf(EDML_pred_pca$res, plot = FALSE),
     main = "Residual (Linear fit with PCA 1 and 2)")

# Residuals vs fitted values for the PC regression
EDML_pred_pca %>%
  ggplot(aes(x = fit, y = res)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

# Same check on the studentized-residual scale
EDML_pred_pca %>%
  ggplot(aes(x = fit, y = rstudent)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

TS

Greenland (GRIP) ice core

Simple Forecasting

# Two deterministic-trend benchmarks: a single global line and a
# piecewise line with knots at times -7200 and -2200.
trend_mod_GRIP <- GRIP_train_ts %>%
  model(linear = TSLM(GRIP ~ trend()),
        piecewise = TSLM(GRIP ~ trend(knots = c(-7200, -2200))))

# 20-step-ahead forecasts (each step is one 20-year index increment)
fc_trend_mod_GRIP <- fabletools::forecast(trend_mod_GRIP, h = 20)

GRIP_train_ts %>%
  autoplot(GRIP) +
  geom_line(data = fitted(trend_mod_GRIP),
            aes(y = .fitted, color = .model)) +
  autolayer(fc_trend_mod_GRIP, alpha = 0.6, level = 95) +
  labs(title = "Trend models and forecasts")

Smoothing

Let’s try some time series smoothing. First, here’s a 9-period moving average. It shows a pretty strong trend.

# Centred 9-point moving average (4 before + current + 4 after);
# .complete = TRUE leaves incomplete windows at both ends as NA.
GRIP_train_ts %>%
  mutate(`GRIP-9-MA` = slider::slide_dbl(GRIP, mean,
                                         .before = 4, .after = 4,
                                         .complete = TRUE)) %>%
  autoplot(GRIP) +
  geom_line(aes(y = `GRIP-9-MA`), color = "orange")
## Warning: Removed 8 rows containing missing values (`geom_line()`).

Right now all 9 points in the smoothing window are equally weighted. Exponential smoothing creates a kernel of weights which attenuate away from the current point backward. Here’s a model with simple exponential smoothing.

# Simple exponential smoothing: additive errors, no trend component
es_mod_GRIP <- GRIP_train_ts %>%
  model(ETS(GRIP ~ error("A") + trend("N")))
fc_es_mod_GRIP <- fabletools::forecast(es_mod_GRIP, h = 20)

# Forecast on top of the training data, with fitted values overlaid
fc_es_mod_GRIP %>%
  autoplot(GRIP_train_ts) +
  geom_line(data = augment(es_mod_GRIP), aes(y = .fitted), col = "orange") +
  labs(title = "Simple exponential smoothing")

Let’s try and incorporate the trend. This is using Holt’s linear trend method.

# Holt's linear trend: additive errors plus an additive trend component
es_mod_GRIP <- GRIP_train_ts %>%
  model(AAN = ETS(GRIP ~ error("A") + trend("A")))
fc_es_mod_GRIP <- fabletools::forecast(es_mod_GRIP, h = 20)

fc_es_mod_GRIP %>%
  autoplot(GRIP_train_ts) +
  geom_line(data = augment(es_mod_GRIP), aes(y = .fitted), col = "orange") +
  labs(title = "AAN exponential smoothing")

As you can see the forecast picked up the downward sloping global linear trend.

Damped linear trend method corrects the infinite Holt’s trend projected into the future and instead attenuates the trend towards a flat line in the remote future. Let’s add it and compare.

# Compare Holt's trend (AAN) against its damped variant (AAdN, phi = 0.9)
es_mod_GRIP <- GRIP_train_ts %>%
  model(AAN = ETS(GRIP ~ error("A") + trend("A")),
        AAdN = ETS(GRIP ~ error("A") + trend("Ad", phi = 0.9)))
fc_es_mod_GRIP <- fabletools::forecast(es_mod_GRIP, h = 20)

fc_es_mod_GRIP %>%
  autoplot(GRIP_train_ts) +
  geom_line(data = augment(es_mod_GRIP), aes(y = .fitted), col = "orange") +
  labs(title = "AAN vs AAdN exponential smoothing")

Time series regression

Let’s try to build a regression model. The time-indexed regression model of the form

\[y_t=\beta_0+\beta_1x_{1,t}+\beta_2x_{2,t}+\dots+\beta_kx_{k,t}+\varepsilon_t.\]

It makes the following assumptions about the errors \((\varepsilon_1,\varepsilon_2,\dots,\varepsilon_T)\):

  • errors are unbiased (having a mean of zero)
  • errors are not autocorrelated (there’s no remaining trend)
  • errors are not related to the predictors (all signal has been extracted)
  • errors are normally distributed with constant variance (for the purposes of making predictive intervals)
# Time-series linear model (TSLM): GRIP on all six production covariates
all_mod_GRIP  <- GRIP_train_ts %>%  
  model(TSLM(GRIP~Q_trop_high+Q_trop_mid+Q_trop_low+
               Q_stra_high+Q_stra_mid+Q_stra_low)) 
report(all_mod_GRIP)
## Series: GRIP 
## Model: TSLM 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.76789 -0.22535 -0.01726  0.19074  1.18938 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    5.151      3.816   1.350    0.178
## Q_trop_high -118.645     78.718  -1.507    0.133
## Q_trop_mid   259.711    187.770   1.383    0.167
## Q_trop_low  -442.566    351.375  -1.260    0.209
## Q_stra_high   20.285     12.843   1.579    0.115
## Q_stra_mid   -87.582     63.651  -1.376    0.170
## Q_stra_low   978.398    783.482   1.249    0.213
## 
## Residual standard error: 0.3396 on 378 degrees of freedom
## Multiple R-squared: 0.6676,  Adjusted R-squared: 0.6623
## F-statistic: 126.5 on 6 and 378 DF, p-value: < 2.22e-16

None of the predictors is significant. How nice! Let’s see the predicted values

# Fitted vs observed GRIP over time
augment(all_mod_GRIP) %>%
  ggplot(aes(x = time)) +
  geom_line(aes(y = GRIP, color = "Data")) +
  geom_line(aes(y = .fitted, color = "Fitted")) +
  scale_color_manual(values = c(Data = "black", Fitted = "orange"))

# Fitted vs observed scatter; points on the identity line fit perfectly
augment(all_mod_GRIP) %>%
  ggplot(aes(x = GRIP, y = .fitted)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0)

# Residual diagnostics: time plot, ACF and histogram of the residuals
all_mod_GRIP %>% gg_tsresiduals()

The plot shows significant autocorrelation in the residuals. We can try and plot the residuals against the covariates.

# Join the model residuals back onto the covariates (by time index),
# stack the Q_* columns, and facet one residual scatter per predictor.
icecore_df %>%
  left_join(residuals(all_mod_GRIP), by = "time") %>%
  pivot_longer(starts_with("Q_"),
               names_to = "covariate", values_to = "value") %>%
  ggplot(aes(x = value, y = .resid)) +
  geom_point() +
  facet_wrap(vars(covariate), scales = "free_x") +
  labs(title = "Residuals vs predictors")
## Warning: Removed 96 rows containing missing values (`geom_point()`).

Lets plot our residuals against the fitted values

# Residuals against fitted values
augment(all_mod_GRIP) %>%
  ggplot(aes(x = .fitted, y = .resid)) +
  geom_point() +
  labs(title = "Residuals vs fitted")

# Key fit statistics for later model comparison
all_mod_GRIP %>%
  glance() %>%
  select(adj_r_squared, CV, AIC, AICc, BIC)
## # A tibble: 1 × 5
##   adj_r_squared    CV   AIC  AICc   BIC
##           <dbl> <dbl> <dbl> <dbl> <dbl>
## 1         0.662 0.118 -823. -822. -791.
# Forecast the held-out period using its known covariate values
fc_all_mod_GRIP <- all_mod_GRIP %>%
  fabletools::forecast(new_data = GRIP_test_ts)
GRIP_train_ts %>%
  autoplot(GRIP) +
  autolayer(fc_all_mod_GRIP) +
  labs(title = "GRIP core readings and forecast")

ARIMA regression

There seems to be quite a lot of signal left in the error term of our regression. We can try to extract this signal by fitting an ARIMA model to the residuals. Let’s start with a single covariate:

\[y_t=\beta_0+\beta_1x_t+\eta_t\]

where \(\eta_t\) is an ARIMA model. The order of the model can be specified explicitly, but can also be left up to the engine to select.

# Dynamic regression: GRIP on Q_stra_high with ARIMA errors;
# ARIMA() selects the error order automatically
ARIMA_mod_GRIP <- GRIP_train_ts %>% 
  model(ARIMA(GRIP~Q_stra_high))
report(ARIMA_mod_GRIP)
## Series: GRIP 
## Model: LM w/ ARIMA(2,1,2) errors 
## 
## Coefficients:
##           ar1     ar2      ma1      ma2  Q_stra_high
##       -0.0234  0.2080  -0.2503  -0.6374       1.1532
## s.e.   0.1439  0.1108   0.1279   0.1246       0.0640
## 
## sigma^2 estimated as 0.05513:  log likelihood=13.46
## AIC=-14.92   AICc=-14.7   BIC=8.78

The function picked the linear model with ARIMA(2,1,2) errors for us. Note that the intercept is gone due to the differencing. The way we can interpret these coefficients is:

\[ \begin{gathered} y_t=1.153x_t+ \eta_t,\\ \eta_t=-0.023\eta_{t-1}+0.2080\eta_{t-2}+\varepsilon_t-0.2503\varepsilon_{t-1}-0.6374\varepsilon_{t-2},\\ \varepsilon_t \sim NID(0,0.05513) \end{gathered} \]

# Innovation-residual diagnostics for the dynamic regression
ARIMA_mod_GRIP %>% gg_tsresiduals()

# Ljung-Box test on the innovations: a large p-value means no evidence
# of remaining autocorrelation.
# NOTE(review): the fitted model has 4 ARMA coefficients (2 AR + 2 MA),
# so dof=4 would be the usual degrees-of-freedom adjustment here --
# confirm why dof=3 was chosen.
augment(ARIMA_mod_GRIP) %>% 
  features(.innov, ljung_box, dof=3, lag=8)
## # A tibble: 1 × 3
##   .model                    lb_stat lb_pvalue
##   <chr>                       <dbl>     <dbl>
## 1 ARIMA(GRIP ~ Q_stra_high)    3.37     0.643

This is really good! The residuals looks nice and the forecast should be sensible. Lets predict!

# Forecast the unobserved early period from its known covariates.
# Fix: labs("ARIMA regression forecast") passed the title as an unnamed
# argument, which does not set the plot title; labs() needs title = ...
fabletools::forecast(ARIMA_mod_GRIP, new_data = GRIP_test_ts) %>%
  autoplot(GRIP_train_ts) +
  labs(title = "ARIMA regression forecast")

We should now see if we can improve the goodness of fit by adding more covariates. As you may remember from the correlogram plot, the covariates from the same latitude (high, mid, low) were strongly correlated. Perhaps we should add only one of each.

# Candidate dynamic regressions: single predictors, per-layer sets,
# per-latitude sets, and the full six-predictor model.
ARIMA_mods <- GRIP_train_ts %>%
  model(
    "stra_high"   = ARIMA(GRIP ~ Q_stra_high),
    "trop_high"   = ARIMA(GRIP ~ Q_trop_high),
    "all_stra"    = ARIMA(GRIP ~ Q_stra_high + Q_stra_mid + Q_stra_low),
    "all_trop"    = ARIMA(GRIP ~ Q_trop_high + Q_trop_mid + Q_trop_low),
    "all_high"    = ARIMA(GRIP ~ Q_trop_high + Q_stra_high),
    "all_highmid" = ARIMA(GRIP ~ Q_trop_high + Q_stra_high + Q_trop_mid + Q_stra_mid),
    "all_highlow" = ARIMA(GRIP ~ Q_trop_high + Q_stra_high + Q_trop_low + Q_stra_low),
    "all"         = ARIMA(GRIP ~ Q_trop_high + Q_stra_high + Q_trop_mid + Q_stra_mid + Q_stra_low + Q_trop_low)
  )
# Compare information criteria across the candidates
glance(ARIMA_mods)
## # A tibble: 8 × 8
##   .model      sigma2 log_lik   AIC  AICc   BIC ar_roots  ma_roots 
##   <chr>        <dbl>   <dbl> <dbl> <dbl> <dbl> <list>    <list>   
## 1 stra_high   0.0551    13.5 -14.9 -14.7  8.78 <cpl [2]> <cpl [2]>
## 2 trop_high   0.0553    12.9 -13.7 -13.5  9.97 <cpl [2]> <cpl [2]>
## 3 all_stra    0.0540    18.3 -20.6 -20.2 11.0  <cpl [2]> <cpl [2]>
## 4 all_trop    0.0542    17.7 -19.3 -18.9 12.3  <cpl [2]> <cpl [2]>
## 5 all_high    0.0553    13.5 -13.0 -12.7 14.6  <cpl [2]> <cpl [2]>
## 6 all_highmid 0.0541    18.4 -18.8 -18.3 16.8  <cpl [2]> <cpl [2]>
## 7 all_highlow 0.0541    18.5 -18.9 -18.5 16.6  <cpl [2]> <cpl [2]>
## 8 all         0.0544    18.6 -15.1 -14.4 28.3  <cpl [2]> <cpl [2]>

Looking at the AIC or the AICc, the “all_stra” model actually scores best, while BIC (which penalizes model complexity a little more) favors our original model with a single high latitude stratosphere variable (“Strahigh”). The two-covariate high-latitude model (“Allhigh”) is a simple compromise worth a closer look.

Let's look at the “Allhigh” model details.

# Extract the "all_high" fit from the model table and print its summary.
GRIP_allhigh_mod <- ARIMA_mods %>% select("all_high")

report(GRIP_allhigh_mod)
## Series: GRIP 
## Model: LM w/ ARIMA(2,1,2) errors 
## 
## Coefficients:
##           ar1     ar2      ma1      ma2  Q_trop_high  Q_stra_high
##       -0.0214  0.2059  -0.2531  -0.6347      -2.3829       1.5881
## s.e.   0.1460  0.1121   0.1304   0.1269       7.6652       1.4004
## 
## sigma^2 estimated as 0.05527:  log likelihood=13.51
## AIC=-13.02   AICc=-12.72   BIC=14.63

Let's look at the residuals.

# Residual diagnostics (time plot, ACF, histogram) for the all_high fit.
GRIP_allhigh_mod %>% 
  gg_tsresiduals() +
  labs(title="Residuals diagnostic plots",
       subtitle = "Allhigh model")

Just awesome! No autocorrelation in residuals, nice distribution. No obvious pattern. A few more diagnostic plots to check the residuals.

# Observed GRIP values (x) against fitted values (y); points hugging the
# 1:1 line indicate a good in-sample fit. The original title said
# "Predicted vs fitted", but the x axis is the observed response, not a
# prediction — relabel accordingly.
augment(GRIP_allhigh_mod) %>% 
  ggplot(aes(x=GRIP,y=.fitted))+
  geom_point()+
  geom_abline(intercept = 0, slope = 1)+
  labs(title="Fitted vs observed",
       subtitle = "Allhigh model")

# Residuals against each production-rate covariate, to check for leftover
# structure the regression missed.
# NOTE(review): the join assumes the residual tsibble's index column is
# named "time"; rows of icecore_df outside the training window get NA
# residuals, which explains the "Removed 96 rows" warning below.
icecore_df %>% 
  left_join(residuals(GRIP_allhigh_mod), by="time") %>% 
  # Stack the Q_* covariates into long format so each gets its own facet.
  pivot_longer(starts_with("Q_"), names_to = "covariate", values_to="value") %>% 
  ggplot(aes(x=value, y=.resid))+
  geom_point()+
  facet_wrap(vars(covariate), scales = "free_x")+
  labs(title="Residuals vs predictors", subtitle = "Allhigh model")
## Warning: Removed 96 rows containing missing values (`geom_point()`).

Let's plot our residuals against the fitted values.

# Residuals against fitted values — a funnel or curve here would signal
# heteroskedasticity or a missed nonlinearity.
augment(GRIP_allhigh_mod) %>% 
  ggplot(aes(x = .fitted, y = .resid)) +
  geom_point() +
  labs(title="Residuals vs fitted", subtitle = "Allhigh model")

No particular pattern here. All good, it seems.

Cross-validation

We will use time series cross-validation to decide which model to use for prediction. We cross-validate the performance of the simple “Strahigh” model against the more complex “Allhigh”.

# Stretching-window CV folds: the first fold holds 100 observations and
# each subsequent fold adds 50 more.
GRIP_train_stretched <- GRIP_train_ts %>% 
  stretch_tsibble(.init=100, .step=50) 
# Refit both finalists on every fold. The error order is pinned to the
# (2,1,2) found on the full training set, and `0 +` drops the constant,
# so each fold re-estimates only the coefficients.
GRIP_cv_mod <- GRIP_train_stretched %>% 
  model("strahigh"=ARIMA(GRIP~0+pdq(2,1,2)+Q_stra_high),
        "allhigh"=ARIMA(GRIP~0+pdq(2,1,2)+Q_stra_high+Q_trop_high))

We now prepare the validation sets and predict on them, calculating the performance. We will predict 16 observations at a time (the size of our test set).

# Validation windows: the 16 time points following each fold (matching the
# test-set size), joined back to the training tsibble to attach covariate
# values (and realized GRIP where available).
GRIP_cv_valid_ts <- new_data(GRIP_train_stretched, n=16) %>% 
  left_join(GRIP_train_ts, by="time")

# Forecast every fold, tag each row with its horizon h (row number within
# each fold/model pair), then restore the fable class — the dplyr verbs
# strip it, and accuracy() below needs a fable.
GRIP_cv_mod_fc <- fabletools::forecast(GRIP_cv_mod, new_data=GRIP_cv_valid_ts) %>% 
  group_by(.id, .model) %>% 
  mutate(h=row_number()) %>% 
  ungroup() %>% 
  as_fable(response="GRIP", distribution=GRIP)

# Accuracy per horizon h, then averaged over h for each model and type.
# summarize_all() is superseded in dplyr; summarise(across(everything(),
# mean)) is the current idiom and yields identical numbers.
GRIP_cv_mod_fc %>% 
  fabletools::accuracy(GRIP_train_ts, by=c("h", ".model")) %>% 
  group_by(.model, .type) %>% 
  summarise(across(everything(), mean))
## # A tibble: 2 × 11
## # Groups:   .model [2]
##   .model   .type     h     ME  RMSE   MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <chr>    <chr> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 allhigh  Test    8.5 -0.153 0.316 0.267 -4.34  7.12 0.981 0.895 -0.128
## 2 strahigh Test    8.5 -0.149 0.309 0.259 -4.21  6.91 0.954 0.876 -0.127

The original model is better in every respect! BIC was right (as always).

Let’s predict using the Q_stra_high variable.

# Forecast the held-out test period from the single-covariate model and
# overlay it on the training series.
fabletools::forecast(ARIMA_mod_GRIP, new_data = GRIP_test_ts) %>% 
  autoplot(GRIP_train_ts) +
  labs(title="Prediction from ARIMA model",
       subtitle="Strahigh model")

Antarctica (EDML) ice core

ARIMA regression

Let’s save ourselves some time and go straight to the ARIMA regression for the EDML response variable.

# Candidate dynamic regressions for the EDML response, ranging from single
# low-latitude covariates up to the full six-covariate model, fitted in
# one mable for side-by-side comparison.
all_mod_EDML <- EDML_train_ts %>% 
  model("stra_low"=ARIMA(EDML~Q_stra_low),
        "trop_low"=ARIMA(EDML~Q_trop_low),
        "all_stra"=ARIMA(EDML~Q_stra_high+Q_stra_mid+Q_stra_low),
        "all_trop"=ARIMA(EDML~Q_trop_high+Q_trop_mid+Q_trop_low),
        "all_low"=ARIMA(EDML~Q_stra_low+Q_trop_low),
        "all_lowmid"=ARIMA(EDML~Q_stra_low+Q_trop_low+Q_stra_mid+Q_trop_mid),
        "all_lowhigh"=ARIMA(EDML~Q_stra_low+Q_trop_low+Q_stra_high+Q_trop_high),
        "all"=ARIMA(EDML~Q_stra_high+Q_stra_mid+Q_stra_low+
                      Q_trop_high+Q_trop_mid+Q_trop_low))
# Compare the fits via their information criteria.
glance(all_mod_EDML)
## # A tibble: 8 × 8
##   .model      sigma2 log_lik   AIC  AICc   BIC ar_roots  ma_roots 
##   <chr>        <dbl>   <dbl> <dbl> <dbl> <dbl> <list>    <list>   
## 1 stra_low    0.0179    203. -393. -392. -366. <cpl [3]> <cpl [1]>
## 2 trop_low    0.0179    203. -392. -392. -365. <cpl [3]> <cpl [1]>
## 3 all_stra    0.0175    209. -396. -395. -354. <cpl [5]> <cpl [1]>
## 4 all_trop    0.0174    209. -397. -396. -355. <cpl [5]> <cpl [1]>
## 5 all_low     0.0174    209. -398. -397. -360. <cpl [4]> <cpl [2]>
## 6 all_lowmid  0.0176    208. -393. -392. -351. <cpl [4]> <cpl [2]>
## 7 all_lowhigh 0.0179    204. -390. -389. -355. <cpl [3]> <cpl [1]>
## 8 all         0.0174    211. -397. -396. -351. <cpl [3]> <cpl [1]>

Judging by the table, the single-covariate “stra_low” model has the lowest BIC and “all_low” the lowest AIC; among the richer four-covariate models, AIC slightly favors the “lowmid” model, while BIC favors the “lowhigh” model.

Let’s pick one of them and look at the residuals.

# Residual diagnostics for the "all_lowmid" candidate.
gg_tsresiduals(select(all_mod_EDML, "all_lowmid"))

Residuals look largely ok with only two lags “sticking out” of the confidence bands: 14 and 22 (no obvious reason why). I am concerned about the non-stationary variance of the residuals and, therefore, the slightly irregular shape of the error distribution.

# Coefficient estimates and information criteria for "all_lowmid".
report(select(all_mod_EDML, "all_lowmid"))
## Series: EDML 
## Model: LM w/ ARIMA(4,0,2) errors 
## 
## Coefficients:
##          ar1      ar2     ar3      ar4      ma1      ma2  Q_stra_low
##       1.7300  -1.2167  0.5075  -0.1376  -0.1061  -0.6246   -117.4080
## s.e.  0.1142   0.1718  0.1495   0.0697   0.0971   0.0841     75.5956
##       Q_trop_low  Q_stra_mid  Q_trop_mid
##          57.0437     -0.9965      0.4834
## s.e.     30.6785      3.4000      9.4865
## 
## sigma^2 estimated as 0.01762:  log likelihood=207.63
## AIC=-393.26   AICc=-392.46   BIC=-351.14

This is a pretty complex model: 4 covariates, 4 lags, and 2 moving averages.

The alternative model’s residuals

# Residual diagnostics for the alternative "all_lowhigh" candidate.
gg_tsresiduals(select(all_mod_EDML, "all_lowhigh"))

Here we have significant autocorrelation at lags 7 and 22.

# Coefficient estimates and information criteria for "all_lowhigh".
report(select(all_mod_EDML, "all_lowhigh"))
## Series: EDML 
## Model: LM w/ ARIMA(3,0,1) errors 
## 
## Coefficients:
##          ar1      ar2     ar3     ma1  Q_stra_low  Q_trop_low  Q_stra_high
##       0.8464  -0.3943  0.0530  0.7735   -163.4156     72.1986       0.3493
## s.e.  0.0807   0.0929  0.0665  0.0461     53.7200     19.3986       0.6953
##       Q_trop_high
##           -3.0112
## s.e.       4.0055
## 
## sigma^2 estimated as 0.01791:  log likelihood=203.95
## AIC=-389.91   AICc=-389.36   BIC=-355.45

This is a somewhat simpler model with 3 lags, 1 moving average term and 4 covariates.

Cross-validation

We will employ the “stretching window” cross-validation scheme to assess the performance of the selected number of models on the training data (effectively treating part of it as a validation set). Here we will set the initial training set to be 100 observations and then at every iteration we will add 61 to the training set, always predicting 61 observations (equivalent to the size of the test set we are trying to predict).

# Stretching-window CV: 100-observation initial fold, growing by 61 each
# iteration so the validation window matches the 61-step test horizon.
EDML_stretched <- EDML_train_ts %>% 
  stretch_tsibble(.init=100, .step=61) 
# Refit both finalists on every fold with their ARIMA orders fixed to the
# ones selected on the full training set (`0 +` drops the constant).
EDML_cv_mod <- EDML_stretched %>% 
  model("all_lowmid"=ARIMA(EDML~0+pdq(4,0,2)+Q_stra_low+Q_trop_low+Q_stra_mid+Q_trop_mid),
        "all_lowhigh"=ARIMA(EDML~0+pdq(3,0,1)+Q_stra_low+Q_trop_low+Q_stra_high+Q_trop_high))
# Validation sets: the 61 time points after each fold, joined back to the
# training data to recover the realized EDML values and the covariates.
EDML_cv_valid_ts <- new_data(EDML_stretched, n=61) %>% 
  left_join(EDML_train_ts, by="time")

# Forecast 61 steps ahead on every fold, tag each row with its horizon h
# (row number within each fold/model pair), then restore the fable class
# that group_by/mutate stripped — accuracy() below needs a fable.
EDML_cv_mod_fc <- fabletools::forecast(EDML_cv_mod, new_data=EDML_cv_valid_ts) %>% 
  group_by(.id, .model) %>% 
  mutate(h=row_number()) %>% 
  ungroup() %>% 
  as_fable(response="EDML", distribution=EDML)

# Accuracy per horizon h, averaged over h for each model and type.
# summarize_all() is superseded in dplyr; summarise(across(everything(),
# mean)) is the current idiom and yields identical numbers.
EDML_cv_mod_fc %>% fabletools::accuracy(EDML_train_ts, by=c("h", ".model")) %>% 
  group_by(.model, .type) %>% 
  summarise(across(everything(), mean))
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing. 
## 4 observations are missing between -1200 and -1140
## # A tibble: 2 × 11
## # Groups:   .model [2]
##   .model      .type     h     ME  RMSE   MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <chr>       <chr> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 all_lowhigh Test     31 0.0293 0.273 0.223 0.277  5.77  1.31  1.26 -0.139
## 2 all_lowmid  Test     31 0.0777 0.290 0.245 1.62   6.35  1.44  1.34 -0.160

The “lowhigh” is slightly better on all metrics.

# Final EDML forecast from the preferred "all_lowhigh" model, plotted over
# the training series.
fabletools::forecast(select(all_mod_EDML, all_lowhigh),
                     new_data = EDML_test_ts) %>% 
  autoplot(EDML_train_ts) +
  labs(title="Prediction from ARIMA model",
       subtitle="Lowhigh model")